# Vito D'Orazio
# September 9, 2020
# builddv.R

# this script reads in full views data and constructs the appropriate DV
# writes out DVs in individual files

rm(list=ls())

library(parsedate) # for date formatting
library(data.table) # for lags and leads

setwd("/Users/vjdorazio/PycharmProjects/views")

# read in data
mydata <- read.csv("data072820/cm_africa.csv")

mydata <- mydata[,c("country_name", "country_id", "ged_best_sb", "month_id", "month", "year")]

mydata$ged_best_sb <- mydata$ged_best_sb+1
mydata$ged_best_sb <- log(mydata$ged_best_sb)

# sort data
mydata <- mydata[order(mydata$country_id, mydata$month_id),]

# set data.table
mydata <- data.table(mydata)

# create lags with data.table
cols = "ged_best_sb"
mydata[, ("lag1") := shift(.SD, 1, NA, "lag"), .SDcols=cols, by=country_id]
mydata[, ("lag2") := shift(.SD, 2, NA, "lag"), .SDcols=cols, by=country_id]
mydata[, ("lag3") := shift(.SD, 3, NA, "lag"), .SDcols=cols, by=country_id]
mydata[, ("lag4") := shift(.SD, 4, NA, "lag"), .SDcols=cols, by=country_id]
mydata[, ("lag5") := shift(.SD, 5, NA, "lag"), .SDcols=cols, by=country_id]
mydata[, ("lag6") := shift(.SD, 6, NA, "lag"), .SDcols=cols, by=country_id]


mydata$dv_1_diff <- mydata$ged_best_sb - mydata$lag1
mydata$date_1 <- mydata$month_id - 1

mydata$dv_2_diff <- mydata$ged_best_sb - mydata$lag2
mydata$date_2 <- mydata$month_id - 2

mydata$dv_3_diff <- mydata$ged_best_sb - mydata$lag3
mydata$date_3 <- mydata$month_id - 3

mydata$dv_4_diff <- mydata$ged_best_sb - mydata$lag4
mydata$date_4 <- mydata$month_id - 4

mydata$dv_5_diff <- mydata$ged_best_sb - mydata$lag5
mydata$date_5 <- mydata$month_id - 5

mydata$dv_6_diff <- mydata$ged_best_sb - mydata$lag6
mydata$date_6 <- mydata$month_id - 6

mydata$date_t <- mydata$month_id

outdata <- mydata[,c("country_id", "date_t", "ged_best_sb", "date_1", "date_2",
                     "date_3", "date_4", "date_5", "date_6", "dv_1_diff",
                     "dv_2_diff", "dv_3_diff", "dv_4_diff", "dv_5_diff", "dv_6_diff")]

colnames(outdata)[3] <- "dv"

write.csv(outdata, "data072820/dv_cm_africa.csv", row.names=FALSE)

#temp <- read.csv("data072820/dv_cm_africa.csv")


## pgm

# read in data
mydata <- read.csv("data072820/pgm_africa_dvsub.csv")

mydata$ged_best_sb <- mydata$ged_best_sb+1
mydata$ged_best_sb <- log(mydata$ged_best_sb)

# sort data
mydata <- mydata[order(mydata$pg_id, mydata$month_id),]

# set data.table
mydata <- data.table(mydata)

# create lags with data.table
cols = "ged_best_sb"
mydata[, ("lag1") := shift(.SD, 1, NA, "lag"), .SDcols=cols, by=pg_id]
mydata[, ("lag2") := shift(.SD, 2, NA, "lag"), .SDcols=cols, by=pg_id]
mydata[, ("lag3") := shift(.SD, 3, NA, "lag"), .SDcols=cols, by=pg_id]
mydata[, ("lag4") := shift(.SD, 4, NA, "lag"), .SDcols=cols, by=pg_id]
mydata[, ("lag5") := shift(.SD, 5, NA, "lag"), .SDcols=cols, by=pg_id]
mydata[, ("lag6") := shift(.SD, 6, NA, "lag"), .SDcols=cols, by=pg_id]


mydata$dv_1_diff <- mydata$ged_best_sb - mydata$lag1
mydata$date_1 <- mydata$month_id - 1

mydata$dv_2_diff <- mydata$ged_best_sb - mydata$lag2
mydata$date_2 <- mydata$month_id - 2

mydata$dv_3_diff <- mydata$ged_best_sb - mydata$lag3
mydata$date_3 <- mydata$month_id - 3

mydata$dv_4_diff <- mydata$ged_best_sb - mydata$lag4
mydata$date_4 <- mydata$month_id - 4

mydata$dv_5_diff <- mydata$ged_best_sb - mydata$lag5
mydata$date_5 <- mydata$month_id - 5

mydata$dv_6_diff <- mydata$ged_best_sb - mydata$lag6
mydata$date_6 <- mydata$month_id - 6

mydata$date_t <- mydata$month_id

outdata <- mydata[,c("pg_id", "date_t", "ged_best_sb", "date_1", "date_2",
                     "date_3", "date_4", "date_5", "date_6", "dv_1_diff",
                     "dv_2_diff", "dv_3_diff", "dv_4_diff", "dv_5_diff", "dv_6_diff")]

colnames(outdata)[3] <- "dv"

write.csv(outdata, "data072820/dv_pgm_africa.csv", row.names=FALSE)


## END SCRIPT

#temp <- read.csv("data072820/dv_pgm_africa.csv")

#length(which(outdata$dv_1 > 0))
#length(which(outdata$dv_1 == 0 & outdata$dv_2 > 0))
#length(which(outdata$dv_1 == 0 & outdata$dv_2 == 0 & outdata$dv_3 > 0))
#length(which(outdata$dv_1 == 0 & outdata$dv_2 == 0 & outdata$dv_3 == 0 & outdata$dv_4 > 0))
#length(which(outdata$dv_1 == 0 & outdata$dv_2 == 0 & outdata$dv_3 == 0 & outdata$dv_4 == 0 & outdata$dv_5 > 0))
#length(which(outdata$dv_1 == 0 & outdata$dv_2 == 0 & outdata$dv_3 == 0 & outdata$dv_4 == 0 & outdata$dv_5 == 0 & outdata$dv_6 > 0))

#temp <- outdata[which(outdata$dv_1>0 | outdata$dv_2>0 | outdata$dv_3>0 | outdata$dv_4>0 | outdata$dv_5>0 | outdata$dv_6>0),]




#pgmvars <- c('acled_count_ns', 'acled_count_os', 'acled_count_pr', 'acled_count_sb', 'acled_dummy_ns', 'acled_dummy_os', 'acled_dummy_pr', 'acled_dummy_sb', 'acled_fat_ns', 'acled_fat_os', 'acled_fat_pr', 'acled_fat_sb', 'country_id', 'country_name', 'decay_12_time_since_acled_dummy_pr', 'decay_12_time_since_ged_dummy_ns', 'decay_12_time_since_ged_dummy_os', 'decay_12_time_since_ged_dummy_sb', 'fvp_demo', 'fvp_grgdpcap_nonoilrent', 'fvp_grgdpcap_oilrent', 'fvp_lngdpcap_nonoilrent', 'fvp_lngdpcap_oilrent', 'fvp_population200', 'fvp_prop_excluded', 'fvp_semi', 'fvp_ssp2_edu_sec_15_24_prop', 'fvp_ssp2_urban_share_iiasa', 'fvp_timeindep', 'fvp_timesincepreindepwar', 'fvp_timesinceregimechange', 'ged_best_ns', 'ged_best_os', 'ged_best_sb', 'ged_count_ns', 'ged_count_os', 'ged_count_sb', 'ged_dummy_ns', 'ged_dummy_os', 'ged_dummy_sb', 'greq_100_ged_best_ns', 'greq_100_ged_best_os', 'greq_100_ged_best_sb', 'greq_1_ged_best_ns', 'greq_1_ged_best_os', 'greq_1_ged_best_sb', 'greq_1_pgd_excluded', 'greq_25_ged_best_ns', 'greq_25_ged_best_os', 'greq_25_ged_best_sb', 'greq_25_splag_1_1_ged_best_ns', 'greq_25_splag_1_1_ged_best_os', 'greq_25_splag_1_1_ged_best_sb', 'greq_500_ged_best_ns', 'greq_500_ged_best_os', 'greq_500_ged_best_sb', 'greq_500_splag_1_1_ged_best_ns', 'greq_500_splag_1_1_ged_best_os', 'greq_500_splag_1_1_ged_best_sb', 'greq_5_ged_best_ns', 'greq_5_ged_best_os', 'greq_5_ged_best_sb', 'in_africa', 'ln_fvp_population200', 'ln_fvp_timeindep', 'ln_fvp_timesincepreindepwar', 'ln_fvp_timesinceregimechange', 'ln_ged_best_ns', 'ln_ged_best_os', 'ln_ged_best_sb', 'ln_pgd_bdist3', 'ln_pgd_capdist', 'ln_pgd_pop_gpw_sum', 'ln_pgd_ttime_mean', 'month', 'pgd_agri_gc', 'pgd_agri_ih', 'pgd_aquaveg_gc', 'pgd_barren_gc', 'pgd_barren_ih', 'pgd_bdist3', 'pgd_capdist', 'pgd_cmr_mean', 'pgd_diamprim', 'pgd_diamsec', 'pgd_drug_y', 'pgd_excluded', 'pgd_forest_gc', 'pgd_forest_ih', 'pgd_gcp_mer', 'pgd_gem', 'pgd_goldplacer', 'pgd_goldsurface', 'pgd_goldvein', 'pgd_grass_ih', 'pgd_gwarea', 'pgd_harvarea', 'pgd_herb_gc', 'pgd_imr_mean', 'pgd_landarea', 'pgd_maincrop', 'pgd_mountains_mean', 'pgd_nlights_calib_mean', 'pgd_pasture_ih', 'pgd_petroleum', 'pgd_pop_gpw_sum', 'pgd_savanna_ih', 'pgd_shrub_gc', 'pgd_shrub_ih', 'pgd_temp', 'pgd_ttime_mean', 'pgd_urban_gc', 'pgd_urban_ih', 'pgd_water_gc', 'pgd_water_ih', 'spdist_pgd_diamsec', 'spdist_pgd_petroleum', 'spei_1', 'spei_12', 'spei_24', 'spei_3', 'spei_36', 'spei_48', 'splag_1_1_acled_dummy_pr', 'splag_1_1_ged_best_ns', 'splag_1_1_ged_best_os', 'splag_1_1_ged_best_sb', 'splag_1_1_ged_dummy_ns', 'splag_1_1_ged_dummy_os', 'splag_1_1_ged_dummy_sb', 'stdist_k1_t001_ged_dummy_ns', 'stdist_k1_t001_ged_dummy_os', 'stdist_k1_t001_ged_dummy_sb', 'stdist_k1_t10_ged_dummy_ns', 'stdist_k1_t10_ged_dummy_os', 'stdist_k1_t10_ged_dummy_sb', 'stdist_k1_t1_ged_dummy_ns', 'stdist_k1_t1_ged_dummy_os', 'stdist_k1_t1_ged_dummy_sb', 'time_since_acled_dummy_pr', 'time_since_ged_dummy_ns', 'time_since_ged_dummy_os', 'time_since_ged_dummy_sb', 'time_since_greq_25_ged_best_ns', 'time_since_greq_25_ged_best_os', 'time_since_greq_25_ged_best_sb', 'time_since_greq_25_splag_1_1_ged_best_ns', 'time_since_greq_25_splag_1_1_ged_best_os', 'time_since_greq_25_splag_1_1_ged_best_sb', 'time_since_greq_500_ged_best_ns', 'time_since_greq_500_ged_best_os', 'time_since_greq_500_ged_best_sb', 'time_since_greq_500_splag_1_1_ged_best_ns', 'time_since_greq_500_splag_1_1_ged_best_os', 'time_since_greq_500_splag_1_1_ged_best_sb', 'time_since_splag_1_1_ged_dummy_ns', 'time_since_splag_1_1_ged_dummy_os', 'time_since_splag_1_1_ged_dummy_sb', 'tlag_10_acled_dummy_pr', 'tlag_10_ged_dummy_ns', 'tlag_10_ged_dummy_os', 'tlag_10_ged_dummy_sb', 'tlag_11_acled_dummy_pr', 'tlag_11_ged_dummy_ns', 'tlag_11_ged_dummy_os', 'tlag_11_ged_dummy_sb', 'tlag_12_acled_dummy_pr', 'tlag_12_ged_dummy_ns', 'tlag_12_ged_dummy_os', 'tlag_12_ged_dummy_s',
             'tlag_1_acled_dummy_pr', 'tlag_1_ged_best_ns', 'tlag_1_ged_best_os', 'tlag_1_ged_best_sb', 'tlag_1_ged_dummy_ns', 'tlag_1_ged_dummy_os', 'tlag_1_ged_dummy_sb', 'tlag_1_splag_1_1_acled_dummy_pr', 'tlag_1_splag_1_1_ged_dummy_ns', 'tlag_1_splag_1_1_ged_dummy_os', 'tlag_1_splag_1_1_ged_dummy_sb', 'tlag_2_acled_dummy_pr', 'tlag_2_ged_dummy_ns', 'tlag_2_ged_dummy_os', 'tlag_2_ged_dummy_sb', 'tlag_2_splag_1_1_acled_dummy_pr', 'tlag_2_splag_1_1_ged_dummy_ns', 'tlag_2_splag_1_1_ged_dummy_os', 'tlag_2_splag_1_1_ged_dummy_sb', 'tlag_3_acled_dummy_pr', 'tlag_3_ged_dummy_ns', 'tlag_3_ged_dummy_os', 'tlag_3_ged_dummy_sb', 'tlag_3_splag_1_1_acled_dummy_pr', 'tlag_3_splag_1_1_ged_dummy_ns', 'tlag_3_splag_1_1_ged_dummy_os', 'tlag_3_splag_1_1_ged_dummy_sb', 'tlag_4_acled_dummy_pr', 'tlag_4_ged_dummy_ns', 'tlag_4_ged_dummy_os', 'tlag_4_ged_dummy_sb', 'tlag_5_acled_dummy_pr', 'tlag_5_ged_dummy_ns', 'tlag_5_ged_dummy_os', 'tlag_5_ged_dummy_sb', 'tlag_6_acled_dummy_pr', 'tlag_6_ged_dummy_ns', 'tlag_6_ged_dummy_os','tlag_6_ged_dummy_sb', 'tlag_7_acled_dummy_pr', 'tlag_7_ged_dummy_ns', 'tlag_7_ged_dummy_os', 'tlag_7_ged_dummy_sb', 'tlag_8_acled_dummy_pr', 'tlag_8_ged_dummy_ns', 'tlag_8_ged_dummy_os', 'tlag_8_ged_dummy_sb', 'tlag_9_acled_dummy_pr', 'tlag_9_ged_dummy_ns', 'tlag_9_ged_dummy_os',
             'tlag_9_ged_dummy_sb','year')
